---
title: "Replication File to Create Variables Used in 'Political Knowledge in the Context of Changing Institutions'"
author: "Sofia Vidotto, Rebecca Weitz-Shapiro and Matthew S. Winters"
date: "`r Sys.Date()`"
output: html_document
---

```{r}
library(plyr)
library(tidyverse)
library(foreach)
library(survey)
library(haven)
library(ggrepel)
library(readr)
library(ggplot2)
library(ggpubr)
library(cowplot)
library(gridGraphics)
library(grid)
library(gridExtra)
library(patchwork)
library(MASS)
library(sandwich)
library(lmtest)
library(lme4)
library(stargazer)
library(estimatr)
```

# LAPOP
```{r}
# Read the LAPOP survey file (Stata .dta) into `lapop`
lapop <- haven::read_dta(file = "Datasets/lapop_dataset.dta")
```

```{r}
# Recode demographic variables

# GENDER
# Original variable (q1): Men = 1, Women = 2
# New variable (gender): Men = 0, Women = 1; every other code becomes NA.
# NOTE(review): gender is stored as an *ordered* factor here, whereas the
# Afrobarometer and CSES sections of this file use a plain factor -- confirm
# this difference is intended.
lapop <- lapop %>%
  mutate(
    gender = ordered(
      car::recode(q1, "1=0;2=1;else=NA"),
      levels = c(0, 1),
      labels = c("Men", "Women")
    )
  ) %>%
  relocate(gender, .after = q1)

# EDUCATION
# Original variable (ed) - "What was the highest level of education you have
# completed?" --> 0 to 18 (None = 0; Primary school = 2,3,4,5, or 6; Secondary
# school = 7,8,9,10,11, or 12; Post-secondary qualifications, other than
# university = 13,14,15; University = 13,14,15,16,17,18).
# New variable (edu) --> "Above the median" = 1,
# "Equal to or below the median" = 0, where the median is computed within each
# pais-year group and stored in `education_median`.
lapop <- lapop %>%
  group_by(pais, year) %>%
  mutate(education_median = median(ed, na.rm = TRUE)) %>%
  ungroup() %>%
  mutate(
    edu = factor(
      ifelse(ed > education_median, 1, 0),
      levels = c(0, 1),
      labels = c("Equal to or below the median", "Above the median")
    )
  ) %>%
  # Resulting column order: ed, education_median, edu
  relocate(education_median, edu, .after = ed)

# AGE
# Original variable --> (q2)
# New variable (age) = same values as the original variable
lapop <- mutate(lapop, age = q2)
```

```{r}
# Add a country-wave identifier: `pais` and `wave` pasted together with "_",
# placed immediately after `year`
lapop <- relocate(
  mutate(lapop, countrywave = paste(pais, wave, sep = "_")),
  countrywave,
  .after = year
)
```

```{r}
# Recode outcome variables
# Number of states
# Original variable --> gi3 - How many states does [country X] have?
# Note on the label of variable 'states': It reads "Number of states in [country X]? correct-incorrect answers" (not "Number of states in the US")

# Presidential term
# Original variable --> gi4 - How long is a presidential term in [country X]?

# Value labels of a labelled item as text, with the first apostrophe removed
# so that the labels can be quoted inside a car::recode specification.
label_without_apostrophe <- function(x) {
  str_remove(haven::as_factor(x), "'")
}

# states / term: 1 = correct, 0 = incorrect or don't know, NA = anything else.
# NOTE(review): the gi3 specification matches the labels 'Correcto' /
# 'Incorrecto' while the gi4 specification matches 'Correct' / 'Incorrect';
# confirm both against the value labels in the dataset, because any label that
# is not matched is recoded to NA.
lapop <- lapop %>%
  mutate(
    states = car::recode(
      label_without_apostrophe(gi3),
      "'Correcto'=1; 'Incorrecto'=0; 'Dont Know'=0; else=NA"
    ),
    term = car::recode(
      label_without_apostrophe(gi4),
      "'Correct'=1; 'Incorrect'=0; 'Dont Know'=0; else=NA"
    ),
    # Replace the labelled country codes by their labels (a factor)
    pais = haven::as_factor(pais)
  )
```

```{r}
# Original variable --> gi7 (representatives) - Size of lower house in [country X]?
# "How many representatives does the [lower or only chamber of Congress] have? (Write the exact same number)"

# The question in the LAPOP data does not specify the correct answer in each country, so we create a file with the right answers and use it to identify correct responses in the survey. The dataset "lower_house_last" includes correct answers for the number of representatives for all Latin American countries in 2012 and 2014.
lower_house <- read_csv("Datasets/Institutions variables/lower_house_last.csv")

# Long table of correct answers: one row per country and answer column.
# Every column except `country`, `2012` and `2014` is stacked; the survey year
# is the "20xx" pattern extracted from the stacked column's name.
tidy_lower_house <- lower_house %>%
  gather(key = "year", value = "correct", -country, -`2012`, -`2014`) %>%
  mutate(year = as.numeric(str_extract(year, "20[0-9][0-9]"))) %>%
  dplyr::select(pais = country, year, correct)

# Original variable: Representatives (gi7) - Size of lower house in [country X]?
# The join uses the columns the two tables share (presumably pais and year --
# TODO confirm lapop has no other column in common with tidy_lower_house).
# reps_right is 1 when the answer equals the correct number and 0 otherwise;
# "Don't Know" answers are then set to -99 in `reps`.
lapop <- left_join(mutate(lapop, reps = gi7), tidy_lower_house) %>%
  mutate(
    reps_right = as.numeric(reps == correct),
    reps = ifelse(as_factor(reps) == "Don't Know", -99, reps)
  )

# The 2009 constitution reform in Panama established that the National Assembly would have 71 deputies and one alternate. We therefore accept either 71 or 72 as the correct answer to the number of members.
# NOTE(review): 166 is also accepted for Colombia in 2012; the reason is not
# documented in this file.
lapop <- lapop %>%
  mutate(
    reps_right = case_when(
      pais == "Panama" & gi7 == 72 ~ 1,
      pais == "Colombia" & year == 2012 & gi7 == 166 ~ 1,
      TRUE ~ reps_right
    )
  )
```

### Add institutional variables 
```{r}
# The institutions datasets include information about the last time each
# institution (i.e., number of states, length of the presidential term, and the
# number of representatives in the Lower House) changed in a country.

# States: keep the first five columns, stack every column except the country
# into (year, value) pairs, and compute the years elapsed between `year` and
# `value`.
states_inst <- read_csv("Datasets/Institutions variables/provs.csv") %>%
  dplyr::select(1:5) %>%
  dplyr::rename(pais = country) %>%
  gather(key = "year", value = "value", -pais) %>%
  mutate(
    year = as.numeric(year),
    states_years_since = year - value
  )

# Presidential term: one wide file; each group of columns is picked by a
# substring of its name and the year is the "20xx" pattern in the column name.
pres_dt <- read_csv("Datasets/Institutions variables/pres.csv")

# Last time the rule changed
term_lrc_inst <- pres_dt %>%
  dplyr::select(pais = Country, contains("lrc")) %>%
  gather(key = "year", value = "value", -pais) %>%
  mutate(
    year = as.numeric(str_extract(year, pattern = "20[0-9][0-9]")),
    term_years_since_lrc = year - value
  )

# Last time the rule changed or transition to democracy
term_lc_inst <- pres_dt %>%
  dplyr::select(pais = Country, contains("lc")) %>%
  gather(key = "year", value = "value", -pais) %>%
  mutate(
    year = as.numeric(str_extract(year, pattern = "20[0-9][0-9]")),
    term_years_since_lc = year - value
  )

# Terms over completed terms since 1980
terms_inst <- pres_dt %>%
  dplyr::select(pais = Country, contains("terms")) %>%
  gather(key = "year", value = "terms", -pais) %>%
  mutate(year = as.numeric(str_extract(year, pattern = "20[0-9][0-9]")))

complete_inst <- pres_dt %>%
  dplyr::select(pais = Country, contains("complete")) %>%
  gather(key = "year", value = "complete", -pais) %>%
  mutate(year = as.numeric(str_extract(year, pattern = "20[0-9][0-9]")))

# Representatives: first three columns of the lower-house file loaded earlier
reps_inst <- dplyr::select(lower_house, 1:3) %>%
  dplyr::rename(pais = country) %>%
  gather(key = "year", value = "value", -pais) %>%
  mutate(
    year = as.numeric(year),
    reps_years_since = year - value
  )

# Clean the dataset: drop the existing `year` column and use `wave` as `year`
lapop <- dplyr::rename(dplyr::select(lapop, -year), year = wave)

# Merge the institutions data with the lapop data, one table at a time and in
# this order (the order determines the suffixes given to the repeated `value`
# columns), then compute the ratio of completed terms to terms and the inverse
# hyperbolic sine of each "years since last change" variable.
institution_tables <- list(
  states_inst, term_lrc_inst, term_lc_inst,
  terms_inst, complete_inst, reps_inst
)

lapop <- reduce(
  institution_tables,
  function(survey_data, institution_table) {
    left_join(survey_data, institution_table, by = c("pais", "year"))
  },
  .init = lapop
)

lapop <- lapop %>%
  mutate(
    complete_terms_ratio = complete / terms,
    across(
      c(
        states_years_since, term_years_since_lrc,
        term_years_since_lc, reps_years_since
      ),
      asinh,
      .names = "{.col}_asinh"
    )
  )
```

### Add country-level covariates
```{r}
# Select country level covariates for gdp per capita, and democracy (two alternative sources - Polity and VDEM)
# 1) GDP per capita - Data comes from the World Bank for 2004, 2006, 2008, 2010, 2012 and 2014, depending on the country. Original dataset is available at  https://databank.worldbank.org/reports.aspx?source=2&series=NY.GDP.PCAP.PP.CD&country=

# 2) Democracy Index corresponds to Polity. Original dataset is available at (https://www.systemicpeace.org/polityproject.html). Dataset includes measures for Polity and Polity 2 for all countries to which the data was available (Bahamas, Belize, and Barbados are excluded due to lack of availability for the corresponding years).

# Only the first seven columns of the controls file are kept.
controls_lapop <- read_csv("Datasets/Control variables/controls_country_level_lapop.csv")[, 1:7]

# log(GDP per capita) instead of GDP per capita
controls_lapop$gdp_log <- log(controls_lapop$gdp_per_capita)

# 3) Alternative measure of democratic level (VDEM). Original dataset available at (https://github.com/vdeminstitute/vdemdata). Variables selected for the analysis: 'v2x_partipdem' (participatory democracy), 'v2x_delibdem' (deliberative democracy), and 'v2x_polyarchy' (electoral democracy).
# Participatory democracy index - v2x_partipdem -  Original question: To what extent is the ideal of participatory democracy achieved?
# Deliberative democracy index - v2x_delibdem - Original question: To what extent is the ideal of deliberative democracy achieved?
# Electoral democracy index - v2x_polyarchy - Original Question: To what extent is the ideal of electoral democracy in its fullest sense achieved?

# Install devtools / vdemdata only when they are missing. requireNamespace()
# is used for the check because it tests availability without attaching the
# package; the library() calls below then attach them and fail loudly if the
# installation did not succeed.
if (!requireNamespace("devtools", quietly = TRUE)) {
  install.packages("devtools")
}
library(devtools)
if (!requireNamespace("vdemdata", quietly = TRUE)) {
  devtools::install_github("vdeminstitute/vdemdata")
}
library(vdemdata)
data("vdem")

# Keep the three democracy indices and give them the short names used below;
# the country column is renamed to `pais` to match the LAPOP data.
vdem_small <- vdem[, c("country_name", "year", "v2x_partipdem", "v2x_delibdem", "v2x_polyarchy")]

vdem_small <- vdem_small %>%
  dplyr::rename(
    pais = country_name,
    partipdem = v2x_partipdem,
    delibdem = v2x_delibdem,
    polyarchy = v2x_polyarchy
  )

# Replace "United States of America" with "United States" and "Trinidad and Tobago" with "Trinidad & Tobago" in 'pais' for the merge
vdem_small <- vdem_small %>%
  mutate(
    pais = str_replace(pais, "United States of America", "United States"),
    pais = str_replace(pais, "Trinidad and Tobago", "Trinidad & Tobago")
  )

# Merge: V-Dem onto the country-level controls, then the controls onto the
# survey data, both by country and year
controls_lapop <- controls_lapop %>%
  left_join(vdem_small, by = c("pais", "year"))
lapop <- lapop %>%
  left_join(controls_lapop, by = c("pais", "year"))
```

# Country-Year-Level Data

```{r}
# Each subset below keeps the variables for one outcome, drops respondents for
# whom either the outcome or its asinh "years since" variable is missing, and
# wraps the rows in a survey design with no clustering (id = ~1) weighted by
# `wt`. svyby() then computes weighted means by year and country.

# Subset for states
states_dt <- lapop %>%
  dplyr::select(year, pais, wt, states, states_years_since, states_years_since_asinh, gdp_log, polity2, partipdem, delibdem, polyarchy) %>%
  filter(!is.na(states) & !is.na(states_years_since_asinh)) %>% # Drop rows where states OR states_years_since_asinh is NA
  svydesign(id = ~1, weights = ~wt, data = .)

# Create means for the outcome variable
states_means <- svyby(~states + states_years_since + states_years_since_asinh + gdp_log + polity2 + partipdem + delibdem + polyarchy, by = ~year + pais, states_dt, svymean)

# Subset for presidential term
term_dt <- lapop %>%
  dplyr::select(year, pais, wt, term, term_years_since_lc, term_years_since_lrc, term_years_since_lc_asinh, term_years_since_lrc_asinh, gdp_log, polity2, partipdem, delibdem, polyarchy) %>%
  filter(!is.na(term) & !is.na(term_years_since_lrc_asinh)) %>% # Drop rows where term OR term_years_since_lrc_asinh is NA
  svydesign(id = ~1, weights = ~wt, data = .)

# The formula previously listed term_years_since_lrc twice and never included
# term_years_since_lc, so the mean of the latter was missing from term_means.
# Both "years since" measures are now summarised, matching the asinh pair.
term_means <- svyby(~term + term_years_since_lc + term_years_since_lrc + term_years_since_lc_asinh + term_years_since_lrc_asinh + gdp_log + polity2 + partipdem + delibdem + polyarchy, by = ~year + pais, term_dt, svymean)

# Subset for number of representatives in the Lower House
reps_dt <- lapop %>%
  dplyr::select(year, pais, wt, reps_right, reps_years_since, reps_years_since_asinh, gdp_log, polity2, partipdem, delibdem, polyarchy) %>%
  filter(!is.na(reps_right) & !is.na(reps_years_since_asinh)) %>% # Drop rows where reps_right OR reps_years_since_asinh is NA
  svydesign(id = ~1, weights = ~wt, data = .)

reps_means <- svyby(~reps_right + reps_years_since + reps_years_since_asinh + gdp_log + polity2 + partipdem + delibdem + polyarchy, by = ~year + pais, reps_dt, svymean)

```

```{r}
# Save the data: each list element is written to
# "Datasets/Processed datasets/lapop/<element name>.csv" without row names
data_objects <- list(
  states_means = states_means,
  term_means = term_means,
  reps_means = reps_means,
  dt2 = lapop
)

iwalk(data_objects, function(dataset, dataset_name) {
  write.csv(
    dataset,
    paste0("Datasets/Processed datasets/lapop/", dataset_name, ".csv"),
    row.names = FALSE
  )
})
```

# AFROBAROMETER
```{r}
# Read the Afrobarometer survey file (Stata .dta) into `afrob`
afrob <- haven::read_dta(file = "Datasets/afrobarometer_dataset.dta")
```

```{r}
# Recode demographic variables

# GENDER
# Original variable (gender) --> Men = 1, Women = 2, 99 = DKs
# New variable (gender) --> Men = 0, Women = 1, anything else = NA
afrob <- afrob %>%
  mutate(
    gender = as.numeric(gender),
    # .default = NA_real_ sends every value other than 1 or 2 to NA
    gender = dplyr::recode(gender, `1` = 0, `2` = 1, .default = NA_real_),
    gender = factor(gender, levels = c(0, 1), labels = c("Men", "Women"))
  )

# EDUCATION
# Original variable: (round 3) (q90) - What is the highest level of education you have completed? From 0 to 9 (0=No formal schooling, 1=Informal schooling (including Koranic schooling), 2=Some primary schooling, 3=Primary school completed, 4=Some secondary school/ High school, 5=Secondary school completed/High school, 6=Post-secondary qualifications, other than university e.g. a diploma or degree from a technical/polytechnic/college, 7=Some university, 8=University completed, 9=Post-graduate, 98=Refused to Answer, 99=Don’t Know, -1=Missing Data)
# New variable (edu) --> "Above the median" = 1, "Equal to or below the median"=0

# Step 1: put both rounds on a common scale.
# Round 3 code 0 ("No formal schooling") previously matched no branch and was
# silently turned into NA, although step 2 below expects a 0 category; it is
# now kept as 0 so these respondents fall in the lower education group.
# NOTE(review): round 1 code 0 is still left unmatched (NA) because the round 1
# coding is not documented in this file -- confirm whether it should be kept.
afrob <- afrob %>%
  mutate(educ = as.numeric(educ)) %>%
  mutate(
    edu = case_when(
      round == 1 & educ %in% 1:3 ~ educ,
      round == 1 & educ %in% c(98, 99, 10) ~ NA_real_,
      round == 3 & educ == 0 ~ 0,
      round == 3 & educ %in% 1:3 ~ 1,
      round == 3 & educ %in% 4:5 ~ 2,
      round == 3 & educ %in% 6:9 ~ 3,
      round == 3 & educ %in% c(98, 99, -1) ~ NA_real_
    )
  )

# Step 2: collapse the common scale into the binary indicator
afrob <- afrob %>%
  mutate(
    edu = case_when(
      edu %in% 0:1 ~ 0,
      edu %in% 2:3 ~ 1,
      TRUE ~ NA_real_
    ),
    edu = factor(edu, levels = c(0, 1), labels = c("Equal to or below the median", "Above the median"))
  )

# AGE
# Recode invalid/missing age values to NA
afrob <- afrob %>%
  mutate(age = ifelse(age < 0 | age > 99, NA, age)) # Ages above 99 are unreasonable (100,101,102,115,120,130,998,999)
```

```{r}
# Add a country-year identifier: `country` and `year` pasted together with
# "_", placed immediately after `country`
afrob <- relocate(
  mutate(afrob, countryyear = paste(country, year, sep = "_")),
  countryyear,
  .after = country
)
```

```{r}
# Recode outcome variables
# Reelection
# Original variable --> q44b2 (reelection) - Question: Do you happen to know: How many times someone can be elected President/Prime Minister? - Round 3 (2005-2006), not asked in Zimbabwe. Values from 1 to 3 (1=Know but can’t remember, 2=Incorrect guess, 3=Correct name, 9=Don’t Know, 98=Refused to Answer, -1=Missing Data)

# Plurality party
# Original variable --> q44a2 (plurality party) - Question:  Do you happen to know: Which political party has the most seats in parliament/national assembly? - Round 3 (2005-2006), not asked in Zimbabwe. Values from 1 to 3 (1=Know but can’t remember, 2=Incorrect guess, 3=Correct name, 9=Don’t Know, 98=Refused to Answer, -1=Missing Data)

# Deputy president (two rounds):
# Round 1 (1999-2001)--> original variable --> sckvp - Question: Name vice-president, not asked in Uganda. Values> 0,1,5 (0=Incorrect, 1=Correct, 5=Could not determine, 9=Don’t Know, 98=Refused to Answer, 99=Missing Data)
# Round 3 (2005-2006): original variable --> q43c2 - Question: Can you tell me the name of: The Deputy President/Vice President? Values from 1 to 3 (1=Know but can’t remember, 2=Incorrect guess, 3=Correct name, 9=Don’t Know, 98=Refused to Answer, -1=Missing Data).  Not asked in Zimbabwe and Benin has multiple correct answers (dropped)

# In all three outcomes 1 = correct answer, 0 = any other documented response
# code, NA = everything else.
afrob <- afrob %>%
  mutate(
    # "Refused to Answer" is code 98 (see the codebook note above and the
    # parallel `party` recode); this recode previously listed 998, which sent
    # refusals to NA instead of 0.
    term = case_when(
      q44b2 %in% c(1, 2, 9, 98, -1) ~ 0,
      q44b2 == 3 ~ 1,
      TRUE ~ NA_real_
    ),
    party = case_when(
      q44a2 %in% c(1, 2, 9, 98, -1) ~ 0,
      q44a2 == 3 ~ 1,
      TRUE ~ NA_real_
    ),
    # Round 1 code 0 ("Incorrect") was missing from the zero set, so incorrect
    # answers became NA while "don't know" answers counted as 0; it is now
    # coded 0, consistent with how round 3 treats code 2 ("Incorrect guess").
    leader = case_when(
      round == 1 & sckvp %in% c(0, 5, 9, 98, 99) ~ 0,
      round == 1 & sckvp == 1 ~ 1,
      round == 3 & q43c2 %in% c(1, 2, 9, 98, -1) ~ 0,
      round == 3 & q43c2 == 3 ~ 1,
      TRUE ~ NA_real_
    ),
    # Replace the labelled country codes by their labels (a factor)
    country = haven::as_factor(country)
  )
```

### Add institutions data
```{r}
# Institutions datasets include information about the last time each
# institution changed in a country (reelection of the President/Prime Minister,
# the party with most seats in parliament/national assembly, and the Vice
# President or Deputy President).

# Reelection president/PM
# Rows with a recorded change year are tagged as round 3 so that they merge
# onto round 3 respondents only; term_years is the gap between the survey year
# and the year of the change.
reelection <- read_csv("Datasets/Institutions variables/PresidentReelection.csv") %>%
  dplyr::rename(year_changed_term = year_changed) %>%
  mutate(
    round = ifelse(!is.na(year_changed_term), 3, NA),
    term_years = survey_year - year_changed_term
  )

afrob <- left_join(afrob, reelection, by = c("country", "round"))

# Plurality party
# Same construction, using the year the party gained its plurality
plurality_party <- read_csv("Datasets/Institutions variables/PluralityParty.csv") %>%
  dplyr::rename(year_changed_party = year_gained_plurality) %>%
  mutate(
    round = ifelse(!is.na(year_changed_party), 3, NA),
    party_years = survey_year - year_changed_party
  )

afrob <- left_join(afrob, plurality_party, by = c("country", "round"))

# Name of the vice president
# No round is assigned here: the merge uses the `round` column the file
# presumably already provides (TODO confirm).
deputy_pres <- read_csv("Datasets/Institutions variables/DeputyPresident_last.csv") %>%
  dplyr::rename(year_took_office = took_office) %>%
  mutate(leader_years = survey_year - year_took_office)

afrob <- left_join(afrob, deputy_pres, by = c("country", "round"))

# Create asinh transformations for variables term and party (not VP)
afrob <- afrob %>%
  mutate(
    across(c(term_years, party_years), asinh, .names = "{.col}_asinh")
  )
```

### Add country level controls
```{r}
# Select country level controls for gdp per capita, and democracy (two alternative sources - Polity and VDEM)

# 1) GDP per capita - Data comes from the World Bank for 1999, 2000, 2001, 2005 and 2006 depending on the country. Original dataset is available at  https://databank.worldbank.org/reports.aspx?source=2&series=NY.GDP.PCAP.PP.CD&country=

# 2) Democracy Index corresponds to Polity. Original dataset is available at (https://www.systemicpeace.org/polityproject.html). Dataset includes measures for Polity and Polity 2 for all countries to which the data was available (Bahamas, Belize, and Barbados excluded due to lack of availability for the corresponding years).

# Only the first eight columns of the controls file are kept.
controls_afrob <- read_csv("Datasets/Control variables/controls_country_level_afrob.csv")[, 1:8]

# log(GDP per capita) instead of GDP per capita
controls_afrob$gdp_log <- log(controls_afrob$gdp_per_capita)

# Variables selected for the analysis: log (gdp per capita) - gdp_log - and Polity V - polity2 -.

# 3) Alternative measure of democratic level (VDEM). Original dataset available at (https://github.com/vdeminstitute/vdemdata). Variables selected for the analysis: 'v2x_partipdem' (participatory democracy), 'v2x_delibdem' (deliberative democracy), and 'v2x_polyarchy' (electoral democracy).
# Participatory democracy index - v2x_partipdem -  Original question: To what extent is the ideal of participatory democracy achieved?
# Deliberative democracy index - v2x_delibdem - Original question: To what extent is the ideal of deliberative democracy achieved?
# Electoral democracy index - v2x_polyarchy - Original Question: To what extent is the ideal of electoral democracy in its fullest sense achieved?

# Reuse the V-Dem extract built in the LAPOP section; its country column is
# renamed from `pais` to `country` to match the Afrobarometer data. This
# overwrites vdem_small, so the chunk cannot be re-run on its own.
vdem_small <- dplyr::rename(vdem_small, country = pais)

# Merge: V-Dem onto the country-level controls, then the controls onto the
# survey data, both by country and year
controls_afrob <- left_join(controls_afrob, vdem_small, by = c("country", "year"))
afrob <- left_join(afrob, controls_afrob, by = c("country", "year"))
```

# Create Country-Year-Level Data
```{r}
# For each outcome: keep the relevant variables, drop respondents for whom
# either the outcome or its "years since" variable is missing, build a survey
# design with no clustering (id = ~1) weighted by `combinwt`, and compute
# weighted means by year and country.

# Presidential reelection
term_dt <- afrob %>%
  dplyr::select(
    year, country, combinwt, term, term_years, term_years_asinh,
    gdp_log, polity2, partipdem, delibdem, polyarchy
  ) %>%
  filter(!is.na(term), !is.na(term_years_asinh)) %>%
  svydesign(id = ~1, weights = ~combinwt, data = .)

term_means <- svyby(
  ~term + term_years + term_years_asinh + gdp_log + polity2 + partipdem + delibdem + polyarchy,
  by = ~year + country,
  design = term_dt,
  FUN = svymean
)

# Plurality party
party_dt <- afrob %>%
  dplyr::select(
    year, country, combinwt, party, party_years, party_years_asinh,
    gdp_log, polity2, partipdem, delibdem, polyarchy
  ) %>%
  filter(!is.na(party), !is.na(party_years_asinh)) %>%
  svydesign(id = ~1, weights = ~combinwt, data = .)

party_means <- svyby(
  ~party + party_years + party_years_asinh + gdp_log + polity2 + partipdem + delibdem + polyarchy,
  by = ~year + country,
  design = party_dt,
  FUN = svymean
)

# VP leader (no asinh transformation is used for this outcome)
leader_dt <- afrob %>%
  dplyr::select(
    year, country, combinwt, leader, leader_years,
    gdp_log, polity2, partipdem, delibdem, polyarchy
  ) %>%
  filter(!is.na(leader), !is.na(leader_years)) %>%
  svydesign(id = ~1, weights = ~combinwt, data = .)

leader_means <- svyby(
  ~leader + leader_years + gdp_log + polity2 + partipdem + delibdem + polyarchy,
  by = ~year + country,
  design = leader_dt,
  FUN = svymean
)
```

```{r}
# Save the data in a separate folder: each list element is written to
# "Datasets/Processed datasets/afrob/<element name>.csv" without row names
data_objects <- list(
  term_means = term_means,
  party_means = party_means,
  leader_means = leader_means,
  dt = afrob
)

iwalk(data_objects, function(dataset, dataset_name) {
  write.csv(
    dataset,
    paste0("Datasets/Processed datasets/afrob/", dataset_name, ".csv"),
    row.names = FALSE
  )
})
```

# CSES
```{r}
# Read the CSES survey file (Stata .dta) into `cses`
cses <- haven::read_dta(file = "Datasets/CSES_dataset.dta")
```

### Renaming and cleaning variables.
```{r}
# Recode demographic variables

# Original variables --> C1004 (CSES survey country + year identifier), C2001 (age), C2002 (gender), C2003 (education)
# New variables: survey, age, gender, education
cses <- dplyr::rename(
  cses,
  survey = C1004,
  age = C2001,
  gender = C2002,
  education = C2003
)

# GENDER
# Binary variable: 1 -> 0 ("Men"), 2 -> 1 ("Women"). Every other value,
# including the codes 7 and 9, falls through to the NA default.
cses <- cses %>%
  mutate(
    gender = factor(
      dplyr::recode(as.numeric(gender), `1` = 0, `2` = 1, .default = NA_real_),
      levels = c(0, 1),
      labels = c("Men", "Women")
    )
  )

# EDUCATION
# Note that education is different for module 4. The coding of education is different across the surveys, so subtract one (1) from variable education for those in module 4.
# New variable (edu) --> "Above the median" = 1, "Equal to or below the median" = 0,
# where the median is computed within each survey (country-year) and stored in
# `education_median`.
cses <- cses %>%
  mutate(
    # Codes 97 to 100 become NA and code 96 becomes 1
    education_cont = car::recode(education, "97:100=NA; 96=1"),
    # Adjust education_cont for module 4 by subtracting 1
    education_cont = if_else(module == 4, education_cont - 1, education_cont)
  ) %>%
  group_by(survey) %>%
  mutate(
    education_median = median(education_cont, na.rm = TRUE),
    edu = as.integer(education_cont > education_median)
  ) %>%
  ungroup() %>%
  mutate(
    edu = factor(
      edu,
      levels = c(0, 1),
      labels = c("Equal to or below the median", "Above the median")
    )
  )

# AGE
# The codes 997, 998, 999 and the values 1 to 4 are set to NA
cses <- cses %>%
  mutate(age = ifelse(age %in% c(1, 2, 3, 4, 997, 998, 999), NA, age))
```


```{r}
# Outcome variables - A respondent's L-R placement for party X in the survey.
# Original variables : C3011_A (Party A), C3011_B (Party B), C3011_C (Party C).... C3011_I (Party I). Values from 0 to 10 are valid answers, 95='not heard of left-right', 96='not heard of party', 97='refused', 98='not sure', 99='missing'

# One recode specification shared by the nine items: placements 0-10 collapse
# to 'valid' and each non-response code gets its own text label.
lr_recode_spec <- "0:10='valid';
      95='not heard of left-right';
      96='not heard of party';
      97='refused';
      98='not sure';
      99='missing'"

# Creates partyA ... partyI from C3011_A ... C3011_I (the output name is the
# input name with the "C3011_" prefix replaced by "party").
cses <- cses %>%
  mutate(
    across(
      all_of(paste0("C3011_", LETTERS[1:9])),
      ~ car::recode(.x, lr_recode_spec),
      .names = "party{sub('C3011_', '', .col)}"
    )
  )
```


```{r}
# Some parties have 100% of responses for the category "missing" (99). Where DKs are so high, it more likely represents something about survey administration than about people's knowledge --> new variables that indicate the percentage of DKs for each party by country.

# Share of non-missing values of `x` that equal code 99.
# na.rm = TRUE matters in the numerator: the denominator already excludes NA,
# and without it a single NA in a survey would turn the whole share into NA.
share_code_99 <- function(x) {
  sum(x == 99, na.rm = TRUE) / sum(!is.na(x))
}

# Variable C3011_[] instead of party[] because the former is numeric and the latter is character.
# Creates DK_percentageA ... DK_percentageI, constant within each survey.
cses <- cses %>%
  group_by(survey) %>%
  mutate(
    across(
      all_of(paste0("C3011_", LETTERS[1:9])),
      share_code_99,
      .names = "DK_percentage{sub('C3011_', '', .col)}"
    )
  ) %>%
  ungroup()
```

### Party level data on clientelism
```{r}
# Party level data on clientelism - Kitschelt programmaticism dataset.
# Only the three party measures and the country/party identifiers are kept.
pty_dt <- dplyr::select(
  read_csv("Datasets/Party level data/partylevel_20130907.csv"),
  b15, cosalpo_4, cosalpo_3econ, country, party
)

# Updated party data from CSES
lbs <- read_csv("Datasets/Party level data/party_country_labs_updated.csv")

# Crosswalk between Kitschelt data and CSES data - contains party level information for all survey years analyzed.
party_lbs <- read.csv("Datasets/Party level data/updated_parties_by_dataset_final_04252020.csv")

# Attach the CSES party labels to the crosswalk, then attach the Kitschelt
# measures. The right join keeps every crosswalk row even when no Kitschelt
# record matches on country and party name.
party_lbs_temp <- left_join(party_lbs, lbs, by = c("survey", "party"))
pty_lbs_dt <- right_join(
  dplyr::rename(pty_dt, party_name = party),
  party_lbs_temp,
  by = c("country", "party_name")
)
```

# Building the dataset 
```{r}
# Data with DK dummy including demographics -- each row is an individual's response on a particular party
# dk_dummy is 0 when the recoded response is "valid" and 1 otherwise.
part1 <- cses %>%
  dplyr::select(survey, contains("party"), starts_with("DK_percentage"), gender, age, edu) %>%
  pivot_longer(
    cols = -c(survey, starts_with("DK_percentage"), gender, age, edu),
    names_to = "party",
    values_to = "response"
  ) %>%
  mutate(dk_dummy = 1 - as.numeric(response == "valid"))

# Numeric left-right placements in the same long layout as part1.
# The placements used to be stacked with gather(), which orders rows column by
# column (all of party A, then all of party B, ...), while part1 is built with
# pivot_longer(), which orders rows respondent by respondent. Attaching the
# vector by position therefore paired placements with the wrong rows. Both
# tables are now reshaped with pivot_longer(), and the alignment is checked
# before the column is attached.
lr_items <- paste0("C3011_", LETTERS[1:9])

code_long <- cses %>%
  dplyr::select(all_of(lr_items)) %>%
  mutate(across(everything(), as.numeric)) %>%
  pivot_longer(cols = everything(), names_to = "party", values_to = "code") %>%
  mutate(party = str_replace(party, "C3011_", "party"))

stopifnot(
  nrow(code_long) == nrow(part1),
  all(code_long$party == part1$party)
)

# Codes above 10 are non-response codes, not placements
code <- code_long$code
code[which(code > 10)] <- NA

part1$code <- code

# Keep only survey-party combinations present in the party-level table
pty_analysis_dt <- part1 %>%
  inner_join(y = pty_lbs_dt, by = c("survey", "party")) %>%
  dplyr::rename(
    benefits = b15,                # easier names for variables
    prog_struc = cosalpo_4,
    prog_struc_econ = cosalpo_3econ
  )

#b15: how much a party relies on extending benefits to voters. As per Kitschelt, this is calculated by adding together values for types of benefits provided as follows: consumer good provision + preferential public benefits + employment opportunities + gov't contracts + regulatory proceedings (in analyses b15 is renamed to benefits)
#cosalpo_4: party's level of general programmatic structuration.
#cosalpo_3econ: party's level of economic issue programmatic structuration

# Individual-party-level dataset with aggregated party-survey statistics included:
# the statistics are computed per survey-party and then merged back onto the
# individual rows of part1.
pty_agg_dt <- pty_analysis_dt %>%
  group_by(survey, party) %>%
  summarize(
    partysize = median(size, na.rm = TRUE),
    lr_var = var(code, na.rm = TRUE),
    dk_prop = mean(dk_dummy, na.rm = TRUE),
    benefits = mean(benefits, na.rm = TRUE),
    prog_struc = mean(prog_struc, na.rm = TRUE),
    prog_struc_econ = mean(prog_struc_econ, na.rm = TRUE),
    across(starts_with("DK_percentage"), ~ mean(.x, na.rm = TRUE)),
    .groups = "drop"
  ) %>%
  left_join(
    part1 %>%
      dplyr::select(survey, party, dk_dummy, gender, age, edu),
    by = c("survey", "party")
  )

# Conditionally calculate DK percentages for specific parties: within each
# survey, DK_percentageX keeps its survey mean only on rows of party X and is
# NA on every other row.
pty_agg_dt <- pty_agg_dt %>%
  group_by(survey) %>%
  mutate(
    across(
      starts_with("DK_percentage"),
      ~ ifelse(
        party == sub("DK_percentage", "party", cur_column()),
        mean(.x, na.rm = TRUE),
        NA
      )
    )
  ) %>%
  ungroup()
```

```{r}
# Include mode of administration dummy variables - Dataset contains survey level information (not party).
survey_vars <- read.csv("Datasets/cses_survey_level_data_saved.csv", stringsAsFactors = FALSE) %>%
  dplyr::select(survey_id, mail_any, internet_any, phone_any, person_any)

# And merge
pty_agg_dt <- left_join(pty_agg_dt, survey_vars, by = c("survey" = "survey_id"))

# Creating a new column that contains the percentage of DK (missing) for each party (by Country Year).
# The row sum ignores NA, so a row whose nine party columns are all NA gets 0.
# A vectorised rowSums() replaces the previous rowwise() sum: the values are
# the same, but the computation no longer runs one row at a time and the data
# frame is no longer left in a row-wise grouped state for later steps.
dk_party_cols <- paste0("DK_percentage", LETTERS[1:9])

pty_agg_dt$DK_percentage <- unname(
  rowSums(pty_agg_dt[dk_party_cols], na.rm = TRUE)
)

# Drop the original variables for DK by party to clean the dataset
pty_agg_dt <- dplyr::select(pty_agg_dt, -all_of(dk_party_cols))

```

### Add party age data
```{r}
# Party age data comes primarily from the Political Party Database Project (Ribeiro & Locatelli, 2019), supplemented with data from the Members & Activists of Political Parties dataset (van Haute et al., 2018) and original data collection when necessary.
age_dt <- readr::read_csv(file = "Datasets/Party level data/party_age_data.csv")
```

```{r}
# Drop two party measures that are not used further, attach the party age
# data, and derive the party age variables. The join uses whatever columns the
# two tables share (NOTE(review): the keys are not stated here -- confirm them
# against party_age_data.csv).
pty_agg_dt <- pty_agg_dt %>%
  dplyr::select(-c(benefits, prog_struc_econ)) %>%
  left_join(age_dt) %>%
  mutate(
    party_age = age_diff_new,
    # Inverse hyperbolic sine transformation of party age
    party_age_asinh = asinh(party_age)
  )

cses_combined <- pty_agg_dt
```

# Create Country-Year-Level Data
```{r}
# One row per survey-party with the mean of each analysis variable.
# `partysize` was previously summarised twice while `party_age`, although
# selected, was never summarised and so was missing from the output. The
# duplicate is removed and the mean of party_age is added as the last column,
# leaving the existing columns in their original positions.
cses_party_mean <- cses_combined %>%
  # Select the variables used for the plots and OLS models
  dplyr::select(
    survey, party, partysize, dk_dummy, prog_struc, party_age,
    party_age_asinh, DK_percentage, mail_any, internet_any, phone_any,
    person_any
  ) %>%
  group_by(survey, party) %>%
  summarize(
    partysize = mean(partysize, na.rm = TRUE),
    prog_struc = mean(prog_struc, na.rm = TRUE),
    party_age_asinh = mean(party_age_asinh, na.rm = TRUE),
    dk_dummy = mean(dk_dummy, na.rm = TRUE),
    DK_percentage = mean(DK_percentage, na.rm = TRUE),
    mail_any = mean(mail_any, na.rm = TRUE),
    internet_any = mean(internet_any, na.rm = TRUE),
    phone_any = mean(phone_any, na.rm = TRUE),
    person_any = mean(person_any, na.rm = TRUE),
    party_age = mean(party_age, na.rm = TRUE),
    .groups = "drop"
  )
```

```{r}
# Save the data: each list element is written to
# "Datasets/Processed datasets/cses/<element name>.csv" without row names
data_objects <- list(
  cses_party_mean = cses_party_mean,
  dt3 = cses_combined
)

iwalk(data_objects, function(dataset, dataset_name) {
  write.csv(
    dataset,
    paste0("Datasets/Processed datasets/cses/", dataset_name, ".csv"),
    row.names = FALSE
  )
})
```


